* This script performs all the analyses reported in Spamann's comment on Cho et al., in particular Spamann's table 1, plus additional probings of Cho et al.'s data
* This script was written for Stata 14 and uses user-written packages estout (for the tables) and reghdfe (for analyzing TRAC data)
* To run this script in full, you need access to
*** a) USSCdata.dta (pre-assembled USSC data available online on Dataverse at http://dx.doi.org/10.7910/DVN/TZRNKD, or assemble yourself using script and raw data provided there)
*** b) TRACdata.dta (pre-assembled TRAC data available to researchers from TRAC subscriber institutions by emailing trac@syr.edu, or assemble yourself using script and raw data also deposited with TRAC)
*** c) MoralPunishment_R2_Level1_sharedwithcoauthor.dta (Cho et al.'s data set in Stata format provided to me by Kyoungmin Cho)
* IF YOU DO NOT HAVE ACCESS TO ALL THE AFOREMENTIONED DATA, COMMENT OUT THE RELEVANT PARTS OF THE CODE
* (NB: I have marked many of the results from investigating Cho et al.'s data in comments to the code below so that the code can still be useful even if Cho et al.'s data are unavailable)

* INSTRUCTIONS how to run this code:
* 1. Create a directory X with (at least) two folders: one called "data" holding the 3 data files mentioned above, and one called "results" to receive the output of this script
* 2. Point the cd command in line 14 below to your directory X 

cd C:/Users/hspamann/Documents/Projects/In_progress/sleepy_punishers // PC (on Mac, replace C:/Users/hspamann/ with /Users/holgerspamann/)
global controls "sentyr crmhst? offensl trial noconvic offage gender edu?" // variable list from Cho email to Spamann on 12/26/2016

set more off
clear matrix
cap estimates drop *

log using results/Spamann_commment_Cho_analysis.smcl, replace

********************************************************************************
********************************************************************************
**** I. Original Cho et al. data: investigation; models 1 and 4 of Spamann's table 1

use Distr sentdate sentyr senttot SENTTOT0 sensplt SENSPLT0 csenttt CSENTTT0 lsenttt CrmHst? OffensL Trial NoConvic Offage gender race EDU? xcrhissr xfolsor DSPTWOMs if DSPTWOMs<. ///
	using data/MoralPunishment_R2_Level1_sharedwithcoauthor.dta, clear
rename *, lower
misstable pattern lsenttt dsptwoms distr $controls race, frequency // only 2,985 complete observations

* Demonstration that Cho et al.'s main dependent variable (DV) lsenttt is ln(1 + sentence length) excluding zeroes
gen ln_csenttt = ln(csenttt) // = log of sentence length (csenttt is directly from USSC)
gen lsenttt1 = ln(1+csenttt) // = log of 1 + sentence length, excluding zeroes
gen lsenttt01 = ln(1+csenttt0) // = log of 1 + sentence length, including zeroes (csenttt0 is also directly from the USSC -- cf. USSC's codebook)
correl lsenttt* ln_csenttt // perfect correlation of lsenttt with the log(1+) transformation, not with the simple log transformation
misstable pattern lsenttt lsenttt01 csenttt csenttt0 // observations with missing value on csenttt, or 0 on csenttt0, are missing in lsenttt too, i.e., Cho et al.'s main DV excludes sentences of length zero

* Kolmogorov-Smirnov test of normality of Cho et al.'s main dependent variable lsenttt
qui reg lsenttt dsptwoms $controls race distr // quick method to mark the observations in the sample
gen sample = e(sample)
qui sum lsenttt if sample // to obtain the mean and standard deviation of the empirical distribution
ksmirnov lsenttt = normal((lsenttt-r(mean))/r(sd)) if sample
disp r(p) // .00007297

* replication of Cho et al.'s main test (model 2 of their table 1 = model 1 of Spamann's table 1), plus permutations of the dependent variable (incl. Spamann T.1 model 4)
_eststo Cho_original	: mixed lsenttt   dsptwoms $controls race || distr: dsptwoms $controls race // Spamann T.1 model 1 (= Cho et al. T.1 model 2)
_eststo Cho_withzero	: mixed lsenttt01 dsptwoms $controls race || distr: dsptwoms $controls race // Spamann T.1 model 4
_eststo Cho_log			: mixed ln_csentt dsptwoms $controls race || distr: dsptwoms $controls race // shows: using ln(s) instead of ln(1+s) in and of itself weakens Cho et al.'s result


********************************************************************************
********************************************************************************
**** II. Investigation of discrepancies between Cho et al.'s data and USSC data

* prep USSC data (as previously assembled, see script preamble) for merge
tempfile USSC
use if !newcit & (DSTbegin | pre_DSTbegin | post_DSTbegin) using data/USSCdata.dta, clear
duplicates drop distr sentdate senttot senttot0 sensplt sensplt0 trial noconvic xcrhissr xfolsor offage edu? gender race, force // 11 surplus observations (14 if omit race, 16 if also omit edu?)
save `USSC'
rename * *_ussc
merge 1:1 _n using `USSC', assert(3) nogenerate // this duplicates every variable to a _ussc version, which is useful for comparison with Cho et al. data after merging those two
save `USSC', replace

* merge USSC data into Cho: merge on all original variables that are in Cho et al.'s data, plus edu? (derived from neweduc) and binary noconvic (5 duplicates to be dropped)
use Distr sentdate sentyr senttot SENTTOT0 sensplt SENSPLT0 csenttt CSENTTT0 lsenttt CrmHst? OffensL Trial NoConvic Offage gender race EDU? xcrhissr xfolsor DSPTWOMs if DSPTWOMs<. ///
	using data/MoralPunishment_R2_Level1_sharedwithcoauthor.dta, clear
rename *, lower
duplicates drop distr sentdate senttot senttot0 sensplt sensplt0 trial noconvic xcrhissr xfolsor offage edu? gender race, force // 5 surplus obs.
merge 1:1 distr sentdate senttot senttot0 sensplt sensplt0 trial noconvic xcrhissr xfolsor offage edu? gender race using `USSC', noupdate

* analyze match (= missing observations --> almost 200)
tab _merge if !mi(distr, sentdate, senttot, trial, noconvic, xcrhissr, xfolsor, offage, edu1, edu2, edu3, race) // looking only at observations that can be useable, given ... original variables
tab _merge if !mi(distr, lsenttt, sentyr, dsptwoms, crmhst1, crmhst2, crmhst3, crmhst4, crmhst5, offensl, trial, noconvic, offage, gender, race, edu1, edu2, edu3) // ... transformed variables (in particular, recoded xfolsor (93/99 = .))
/* comment on merge: only 13 observations from Cho et al. without missing regression variables are not matched (plus 5 surplus observations dropped before merge) */
/* but the other way around: 199 USSC obs are unmatched -- why? */
bys _merge: tab pettybc // unmatched observations from USSC don't contain a higher fraction of cases including petty offenses (a possibility Cho et al. suggested) (there are very few such cases)
by _merge: tab sentyr if !mi(race_ussc) & newcit_ussc==0 // there is no year pattern either
sum $controls prison senttot sleepy if _merge==2 & !mi(distr, sentdate, senttot, trial, noconvic, xcrhissr, xfolsor, offage, edu1, edu2, edu3, race)

* analyze if variable values derived directly from USSC correspond to those in Cho et al. (--> coding issues -- few)
correl lsenttt logsent1 // =1: verifies that Cho et al.'s logsent is ln(1+csenttot)
correl dsptwoms sleepy // correlation is 1
count if dsptwoms!=sleepy & _merge==3 // zero
foreach var of varlist sentyr crmhst? offensl { // don't need to check variables on which I matched: trial noconvic offage gender edu? race
	correl `var' `var'_ussc // correlation is 1 for all variables
	count if `var'!=`var'_ussc & _merge==3 // zero except for crmhst? (see below)
	}
tab xcrhissr if crmhst1!=crmhst1_ussc & _merge==3 // 186 missing value codes (9); 11 codes "N/A or conviction solely under 18$924(c)" (8)
count if crmhst1!=crmhst1_ussc & _merge==3 & mi(xcrhissr) // 7 observations
sum crmhst? crmhst?_ussc if crmhst1!=crmhst1_ussc & _merge==3 // for all these 204 observations, crmhst? are zero even though they shouldn't (see two lines above), whereas crmhst?_ussc are correctly coded as missing
misstable sum lsenttt dsptwoms $controls race distr if crmhst1!=crmhst1_ussc & _merge==3 // but offensl is missing for all but 2 of the 204 observations with the erroneous zeroes instead of missing in Cho's crmhst?
/* bottom line: all values identical except crmhst?, which in Cho et al. has zero instead of missing for 204 obs. (but only for 2 obs. that matter, i.e., that are in the regression sample) */


********************************************************************************
********************************************************************************
**** III. USSC data: models 2, 3, 5, and 6 of Spamann's table 1 (plus additional robustness checks)

**** models 2 & 3 and similar (3 Mondays only, as in Cho et al.) (most are commented out to speed up the code -- "mixed" is slow)
use if (DSTbegin | pre_DSTbegin | post_DSTbegin) using data/USSCdata.dta, clear

* same models as lines 47-49 above, but using USSC data instead of Cho et al.'s 
_eststo USSC_original	: mixed logsent1 sleepy $controls race	   if newcit==0	& logsent1>0	|| distr: sleepy $controls race, iter(60) // this has 184 more observations than Cho et al. (= Spamann T.1 model 1), otherwise unchanged (using "race" implicitly excludes hispanics)
*_eststo USSC_withzero	: mixed logsent1 sleepy $controls race	   if newcit==0					|| distr: sleepy $controls race, iter(60)
_eststo USSC_log		: mixed logsent  sleepy $controls race	   if newcit==0					|| distr: sleepy $controls race, iter(60)

* same models as lines 47-49 above, but using USSC data AND including hispanics (by using mrace_bw instead of race)
_eststo USSCh_original	: mixed logsent1 sleepy $controls mrace_bw if newcit==0 & logsent1>0	|| distr: sleepy $controls mrace_bw, iter(60)
*_eststo USSCh_withzero	: mixed logsent1 sleepy $controls mrace_bw if newcit==0					|| distr: sleepy $controls mrace_bw, iter(60)
_eststo USSCh_log		: mixed logsent  sleepy $controls mrace_bw if newcit==0					|| distr: sleepy $controls mrace_bw, iter(60) // Spamann T.1 model 2

* same, but now also including foreigners and controlling for hispanic
*_eststo USSCf_original : mixed logsent1 sleepy $controls mrace? hispanic newcit if logsent1>0	|| distr: sleepy $controls mrace? hispanic newcit, iter(60)
*_eststo USSCf_withzero	: mixed logsent1 sleepy $controls mrace? hispanic newcit				|| distr: sleepy $controls mrace? hispanic newcit, iter(60)
_eststo USSCf_log		: mixed logsent sleepy $controls mrace? hispanic newcit					|| distr: sleepy $controls mrace? hispanic newcit, iter(60) // Spamann T.1 model 3

**** models 5 and 6 and similar (all days; cf. C. Yang, Journal of Legal Studies, 44:75-111 (2015))
use data/USSCdata.dta, clear
set matsize 500
gen statminimum = cstatmin>0 & cstatmin<.
local fe "i.sentyr i.dow i.month crimhist##offensl i.offtype2"
local othercontrols "trial noconvic offage offage2 gender edu? mrace? hispanic newcit dependents statminimum"
_eststo Yang_log	: xtreg logsent  sleepy `othercontrols' `fe' if offensl<44, fe cluster(distr) // Maximum offense level in sentencing grid is 43; 590 obs. >43 seem data entry error
_eststo Yang_binary	: xtreg prison   sleepy `othercontrols' `fe' if offensl<44, fe cluster(distr)
_eststo Yang_level	: xtreg senttot0 sleepy `othercontrols' `fe' if offensl<44, fe cluster(distr)


*****************************************************************
*****************************************************************
**** IV. TRAC data (out of sample tests)

use data/TRACdata.dta, clear
_eststo TRAC_log	: reghdfe logprison sleepy trial if outofsample, absorb(district judge leadcharge sentyr dow month) cluster(district) // controlling for both judge and district because some judges appear in multiple districts (~22k cases of "travelling" judes). Also, missing judge data is one judge category
_eststo TRAC_binary : reghdfe prison    sleepy trial if outofsample, absorb(district judge leadcharge sentyr dow month) cluster(district)
_eststo TRAC_level	: reghdfe cprison   sleepy trial if outofsample, absorb(district judge leadcharge sentyr dow month) cluster(district)


*****************************************************************
*****************************************************************
**** Conclusion: Regression output tables

set matsize 1000
esttab Cho* USSC* Yang* TRAC* using results/Spamann_comment_b_se.csv, replace ///
	keep(dsptwoms sleepy sentyr crmhst? offensl trial noconvic offage gender edu? race newrace? mrace? hispanic newcit) ///
	order(dsptwoms sleepy sentyr crmhst? offensl trial noconvic offage age2 gender race newrace? mrace? hispanic edu? newcit) b(3) se(3) scalars(converged) sfmt(1) nolines nogaps
	
esttab Cho* USSC* Yang* TRAC* using results/Spamann_comment_p.csv, replace ///
	keep(dsptwoms sleepy) p(3) nolines nogaps
	
esttab Cho_original USSCh_log USSCf_log Cho_withzero Yang_log Yang_binary TRAC_log TRAC_binary using results/table1.csv, replace ///
	keep(dsptwoms sleepy) se(3) nolines nogaps

log close
